{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CTR Prediction\n",
    "\n",
    "\thttps://www.kaggle.com/c/avazu-ctr-prediction/data\n",
    "\n",
    "## File descriptions\n",
    "**train** - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks are subsampled according to different strategies. \n",
    "\n",
    "\thttps://www.kaggle.com/c/avazu-ctr-prediction/download/train.gz\n",
    "\n",
    "**test** - Test set. 1 day of ads to for testing your model predictions. \n",
    "\n",
    "\thttps://www.kaggle.com/c/avazu-ctr-prediction/download/test.gz\n",
    "\n",
    "**sampleSubmission.csv** - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark. \n",
    "\n",
    "\thttps://www.kaggle.com/c/avazu-ctr-prediction/download/sampleSubmission.gz\n",
    "\n",
    "## Data fields\n",
    "id: ad identifier\n",
    "click: 0/1 for non-click/click\n",
    "hour: format is YYMMDDHH, so 14091123 means 23:00 on Sept. 11, 2014 UTC.\n",
    "\n",
    "C1 -- anonymized categorical variable, banner_pos, site_id, site_domain, site_category, app_id, app_domain, app_category, device_id, device_ip, device_model, device_type, device_conn_type, C14-C21 -- anonymized categorical variables\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# When Python imports but IPython does not\n",
    "\n",
    "https://www.lucypark.kr/blog/2013/02/10/when-python-imports-and-ipython-does-not/\n",
    "\n",
    "\n",
    "https://ask.hellobi.com/blog/ysfyb/14174"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:17:04.896514Z",
     "start_time": "2018-10-17T07:17:04.880246Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['',\n '/Users/songtao/pvenv_python2.7/lib/python27.zip',\n '/Users/songtao/pvenv_python2.7/lib/python2.7',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/plat-darwin',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/plat-mac',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/plat-mac/lib-scriptpackages',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/lib-tk',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/lib-old',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/lib-dynload',\n '/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7',\n '/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-darwin',\n '/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/lib-tk',\n '/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac',\n '/usr/local/Cellar/python@2/2.7.15_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/plat-mac/lib-scriptpackages',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/site-packages',\n '/Applications/PyCharm.app/Contents/helpers/pycharm_matplotlib_backend',\n '/Users/songtao/pvenv_python2.7/lib/python2.7/site-packages/IPython/extensions',\n '/Users/songtao/.ipython',\n '/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/src']"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append(\"/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/src\")\n",
    "\n",
    "sys.path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:19:43.435275Z",
     "start_time": "2018-10-17T07:17:04.930242Z"
    }
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Initial setup\n",
    "# train_filename = \"train_small.csv\"\n",
    "train_filename = \"/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/datasets/train/train\"\n",
    "\n",
    "# test_filename = \"test.csv\"\n",
    "test_filename = \"/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/datasets/test/test\"\n",
    "submission_filename = \"/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/out/submit.csv\"\n",
    "\n",
    "training_set = pd.read_csv(train_filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Explore Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:19:43.513023Z",
     "start_time": "2018-10-17T07:19:43.450336Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>click</th>\n",
       "      <th>hour</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>site_id</th>\n",
       "      <th>site_domain</th>\n",
       "      <th>site_category</th>\n",
       "      <th>app_id</th>\n",
       "      <th>app_domain</th>\n",
       "      <th>...</th>\n",
       "      <th>device_type</th>\n",
       "      <th>device_conn_type</th>\n",
       "      <th>C14</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1.000009e+18</td>\n",
       "      <td>0</td>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>15706</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1.000017e+19</td>\n",
       "      <td>0</td>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15704</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1.000037e+19</td>\n",
       "      <td>0</td>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15704</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1.000064e+19</td>\n",
       "      <td>0</td>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>0</td>\n",
       "      <td>1fbe01fe</td>\n",
       "      <td>f3845767</td>\n",
       "      <td>28905ebd</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>15706</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>1722</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>100084</td>\n",
       "      <td>79</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1.000068e+19</td>\n",
       "      <td>0</td>\n",
       "      <td>14102100</td>\n",
       "      <td>1005</td>\n",
       "      <td>1</td>\n",
       "      <td>fe8cc448</td>\n",
       "      <td>9166c161</td>\n",
       "      <td>0569f928</td>\n",
       "      <td>ecad2386</td>\n",
       "      <td>7801e8d9</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>18993</td>\n",
       "      <td>320</td>\n",
       "      <td>50</td>\n",
       "      <td>2161</td>\n",
       "      <td>0</td>\n",
       "      <td>35</td>\n",
       "      <td>-1</td>\n",
       "      <td>157</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "             id  click      hour    C1  banner_pos   site_id site_domain  \\\n",
       "0  1.000009e+18      0  14102100  1005           0  1fbe01fe    f3845767   \n",
       "1  1.000017e+19      0  14102100  1005           0  1fbe01fe    f3845767   \n",
       "2  1.000037e+19      0  14102100  1005           0  1fbe01fe    f3845767   \n",
       "3  1.000064e+19      0  14102100  1005           0  1fbe01fe    f3845767   \n",
       "4  1.000068e+19      0  14102100  1005           1  fe8cc448    9166c161   \n",
       "\n",
       "  site_category    app_id app_domain ...  device_type device_conn_type    C14  \\\n",
       "0      28905ebd  ecad2386   7801e8d9 ...            1                2  15706   \n",
       "1      28905ebd  ecad2386   7801e8d9 ...            1                0  15704   \n",
       "2      28905ebd  ecad2386   7801e8d9 ...            1                0  15704   \n",
       "3      28905ebd  ecad2386   7801e8d9 ...            1                0  15706   \n",
       "4      0569f928  ecad2386   7801e8d9 ...            1                0  18993   \n",
       "\n",
       "   C15  C16   C17  C18  C19     C20  C21  \n",
       "0  320   50  1722    0   35      -1   79  \n",
       "1  320   50  1722    0   35  100084   79  \n",
       "2  320   50  1722    0   35  100084   79  \n",
       "3  320   50  1722    0   35  100084   79  \n",
       "4  320   50  2161    0   35      -1  157  \n",
       "\n",
       "[5 rows x 24 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_set.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:12:37.445436Z",
     "start_time": "2018-10-17T07:12:37.441396Z"
    }
   },
   "source": [
    "## describe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:20:57.110125Z",
     "start_time": "2018-10-17T07:19:43.517538Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>click</th>\n",
       "      <th>hour</th>\n",
       "      <th>C1</th>\n",
       "      <th>banner_pos</th>\n",
       "      <th>device_type</th>\n",
       "      <th>device_conn_type</th>\n",
       "      <th>C14</th>\n",
       "      <th>C15</th>\n",
       "      <th>C16</th>\n",
       "      <th>C17</th>\n",
       "      <th>C18</th>\n",
       "      <th>C19</th>\n",
       "      <th>C20</th>\n",
       "      <th>C21</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "      <td>4.042897e+07</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>9.223017e+18</td>\n",
       "      <td>1.698056e-01</td>\n",
       "      <td>1.410256e+07</td>\n",
       "      <td>1.004968e+03</td>\n",
       "      <td>2.880146e-01</td>\n",
       "      <td>1.015305e+00</td>\n",
       "      <td>3.313150e-01</td>\n",
       "      <td>1.884181e+04</td>\n",
       "      <td>3.188831e+02</td>\n",
       "      <td>6.010201e+01</td>\n",
       "      <td>2.112601e+03</td>\n",
       "      <td>1.432499e+00</td>\n",
       "      <td>2.271444e+02</td>\n",
       "      <td>5.321685e+04</td>\n",
       "      <td>8.338229e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>5.325443e+18</td>\n",
       "      <td>3.754620e-01</td>\n",
       "      <td>2.966837e+02</td>\n",
       "      <td>1.094586e+00</td>\n",
       "      <td>5.063820e-01</td>\n",
       "      <td>5.274336e-01</td>\n",
       "      <td>8.547935e-01</td>\n",
       "      <td>4.959457e+03</td>\n",
       "      <td>2.127250e+01</td>\n",
       "      <td>4.729538e+01</td>\n",
       "      <td>6.094124e+02</td>\n",
       "      <td>1.326227e+00</td>\n",
       "      <td>3.510221e+02</td>\n",
       "      <td>4.995682e+04</td>\n",
       "      <td>7.028996e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>5.211594e+11</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.410210e+07</td>\n",
       "      <td>1.001000e+03</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>3.750000e+02</td>\n",
       "      <td>1.200000e+02</td>\n",
       "      <td>2.000000e+01</td>\n",
       "      <td>1.120000e+02</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>3.300000e+01</td>\n",
       "      <td>-1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>4.611181e+18</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.410230e+07</td>\n",
       "      <td>1.005000e+03</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.692000e+04</td>\n",
       "      <td>3.200000e+02</td>\n",
       "      <td>5.000000e+01</td>\n",
       "      <td>1.863000e+03</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>3.500000e+01</td>\n",
       "      <td>-1.000000e+00</td>\n",
       "      <td>2.300000e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>9.223224e+18</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.410260e+07</td>\n",
       "      <td>1.005000e+03</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>2.034600e+04</td>\n",
       "      <td>3.200000e+02</td>\n",
       "      <td>5.000000e+01</td>\n",
       "      <td>2.323000e+03</td>\n",
       "      <td>2.000000e+00</td>\n",
       "      <td>3.900000e+01</td>\n",
       "      <td>1.000480e+05</td>\n",
       "      <td>6.100000e+01</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1.383561e+19</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>1.410281e+07</td>\n",
       "      <td>1.005000e+03</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>0.000000e+00</td>\n",
       "      <td>2.189400e+04</td>\n",
       "      <td>3.200000e+02</td>\n",
       "      <td>5.000000e+01</td>\n",
       "      <td>2.526000e+03</td>\n",
       "      <td>3.000000e+00</td>\n",
       "      <td>1.710000e+02</td>\n",
       "      <td>1.000930e+05</td>\n",
       "      <td>1.010000e+02</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1.844674e+19</td>\n",
       "      <td>1.000000e+00</td>\n",
       "      <td>1.410302e+07</td>\n",
       "      <td>1.012000e+03</td>\n",
       "      <td>7.000000e+00</td>\n",
       "      <td>5.000000e+00</td>\n",
       "      <td>5.000000e+00</td>\n",
       "      <td>2.405200e+04</td>\n",
       "      <td>1.024000e+03</td>\n",
       "      <td>1.024000e+03</td>\n",
       "      <td>2.758000e+03</td>\n",
       "      <td>3.000000e+00</td>\n",
       "      <td>1.959000e+03</td>\n",
       "      <td>1.002480e+05</td>\n",
       "      <td>2.550000e+02</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 id         click          hour            C1    banner_pos  \\\n",
       "count  4.042897e+07  4.042897e+07  4.042897e+07  4.042897e+07  4.042897e+07   \n",
       "mean   9.223017e+18  1.698056e-01  1.410256e+07  1.004968e+03  2.880146e-01   \n",
       "std    5.325443e+18  3.754620e-01  2.966837e+02  1.094586e+00  5.063820e-01   \n",
       "min    5.211594e+11  0.000000e+00  1.410210e+07  1.001000e+03  0.000000e+00   \n",
       "25%    4.611181e+18  0.000000e+00  1.410230e+07  1.005000e+03  0.000000e+00   \n",
       "50%    9.223224e+18  0.000000e+00  1.410260e+07  1.005000e+03  0.000000e+00   \n",
       "75%    1.383561e+19  0.000000e+00  1.410281e+07  1.005000e+03  1.000000e+00   \n",
       "max    1.844674e+19  1.000000e+00  1.410302e+07  1.012000e+03  7.000000e+00   \n",
       "\n",
       "        device_type  device_conn_type           C14           C15  \\\n",
       "count  4.042897e+07      4.042897e+07  4.042897e+07  4.042897e+07   \n",
       "mean   1.015305e+00      3.313150e-01  1.884181e+04  3.188831e+02   \n",
       "std    5.274336e-01      8.547935e-01  4.959457e+03  2.127250e+01   \n",
       "min    0.000000e+00      0.000000e+00  3.750000e+02  1.200000e+02   \n",
       "25%    1.000000e+00      0.000000e+00  1.692000e+04  3.200000e+02   \n",
       "50%    1.000000e+00      0.000000e+00  2.034600e+04  3.200000e+02   \n",
       "75%    1.000000e+00      0.000000e+00  2.189400e+04  3.200000e+02   \n",
       "max    5.000000e+00      5.000000e+00  2.405200e+04  1.024000e+03   \n",
       "\n",
       "                C16           C17           C18           C19           C20  \\\n",
       "count  4.042897e+07  4.042897e+07  4.042897e+07  4.042897e+07  4.042897e+07   \n",
       "mean   6.010201e+01  2.112601e+03  1.432499e+00  2.271444e+02  5.321685e+04   \n",
       "std    4.729538e+01  6.094124e+02  1.326227e+00  3.510221e+02  4.995682e+04   \n",
       "min    2.000000e+01  1.120000e+02  0.000000e+00  3.300000e+01 -1.000000e+00   \n",
       "25%    5.000000e+01  1.863000e+03  0.000000e+00  3.500000e+01 -1.000000e+00   \n",
       "50%    5.000000e+01  2.323000e+03  2.000000e+00  3.900000e+01  1.000480e+05   \n",
       "75%    5.000000e+01  2.526000e+03  3.000000e+00  1.710000e+02  1.000930e+05   \n",
       "max    1.024000e+03  2.758000e+03  3.000000e+00  1.959000e+03  1.002480e+05   \n",
       "\n",
       "                C21  \n",
       "count  4.042897e+07  \n",
       "mean   8.338229e+01  \n",
       "std    7.028996e+01  \n",
       "min    1.000000e+00  \n",
       "25%    2.300000e+01  \n",
       "50%    6.100000e+01  \n",
       "75%    1.010000e+02  \n",
       "max    2.550000e+02  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_set.describe()\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<class 'pandas.core.frame.DataFrame'>\n",
    "RangeIndex: 40428967 entries, 0 to 40428966\n",
    "Data columns (total 24 columns):\n",
    "id                  float64\n",
    "click               int64\n",
    "hour                int64\n",
    "C1                  int64\n",
    "banner_pos          int64\n",
    "site_id             object\n",
    "site_domain         object\n",
    "site_category       object\n",
    "app_id              object\n",
    "app_domain          object\n",
    "app_category        object\n",
    "device_id           object\n",
    "device_ip           object\n",
    "device_model        object\n",
    "device_type         int64\n",
    "device_conn_type    int64\n",
    "C14                 int64\n",
    "C15                 int64\n",
    "C16                 int64\n",
    "C17                 int64\n",
    "C18                 int64\n",
    "C19                 int64\n",
    "C20                 int64\n",
    "C21                 int64\n",
    "dtypes: float64(1), int64(14), object(9)\n",
    "memory usage: 7.2+ GB"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "## info"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:20:57.140533Z",
     "start_time": "2018-10-17T07:20:57.116846Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 40428967 entries, 0 to 40428966\n",
      "Data columns (total 24 columns):\n",
      "id                  float64\n",
      "click               int64\n",
      "hour                int64\n",
      "C1                  int64\n",
      "banner_pos          int64\n",
      "site_id             object\n",
      "site_domain         object\n",
      "site_category       object\n",
      "app_id              object\n",
      "app_domain          object\n",
      "app_category        object\n",
      "device_id           object\n",
      "device_ip           object\n",
      "device_model        object\n",
      "device_type         int64\n",
      "device_conn_type    int64\n",
      "C14                 int64\n",
      "C15                 int64\n",
      "C16                 int64\n",
      "C17                 int64\n",
      "C18                 int64\n",
      "C19                 int64\n",
      "C20                 int64\n",
      "C21                 int64\n",
      "dtypes: float64(1), int64(14), object(9)\n",
      "memory usage: 7.2+ GB\n"
     ]
    }
   ],
   "source": [
    "\n",
    "training_set.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:20:58.206178Z",
     "start_time": "2018-10-17T07:20:57.144767Z"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "from sklearn import metrics\n",
    "from sklearn.externals import joblib\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "# from sklearn.cross_validation import train_test_split\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "from utils import load_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# deal with the data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:20:58.223109Z",
     "start_time": "2018-10-17T07:20:58.211394Z"
    }
   },
   "outputs": [],
   "source": [
    "# 结果衡量\n",
    "def print_metrics(true_values, predicted_values):\n",
    "  \n",
    "    print (\"Accuracy: \", metrics.accuracy_score(true_values, predicted_values))\n",
    "    print (\"AUC: \", metrics.roc_auc_score(true_values, predicted_values))\n",
    "    print (\"Confusion Matrix: \", + metrics.confusion_matrix(true_values, predicted_values))\n",
    "    print (metrics.classification_report(true_values, predicted_values))\n",
    "\n",
    "\n",
    "# 拟合分类器\n",
    "def classify(classifier_class, train_input, train_targets):\n",
    "    classifier_object = classifier_class()\n",
    "    classifier_object.fit(train_input, train_targets)\n",
    "    return classifier_object\n",
    "\n",
    "# 模型存储\n",
    "def save_model(clf):\n",
    "    joblib.dump(clf, '/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/out/model/classifier_ctr.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:24:08.008691Z",
     "start_time": "2018-10-17T07:20:58.227301Z"
    }
   },
   "outputs": [],
   "source": [
    "# train_data = load_df('train_small.csv').values\n",
    "train_data = load_df(train_filename).values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:24:08.044096Z",
     "start_time": "2018-10-17T07:24:08.022269Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[       0, 14102100,     1005, ...,       35,       -1,       79],\n",
       "       [       0, 14102100,     1005, ...,       35,   100084,       79],\n",
       "       [       0, 14102100,     1005, ...,       35,   100084,       79],\n",
       "       ...,\n",
       "       [       0, 14103023,     1005, ...,      813,       -1,       46],\n",
       "       [       1, 14103023,     1005, ...,       39,   100194,       23],\n",
       "       [       0, 14103023,     1005, ...,      431,   100084,      221]])"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_data[:,:]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Training & Saving Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:27:25.359627Z",
     "start_time": "2018-10-17T07:24:08.048462Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/songtao/pvenv_python2.7/lib/python2.7/site-packages/sklearn/linear_model/logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('Accuracy: ', 0.8301626284320377)\n",
      "('AUC: ', 0.5)\n",
      "('Confusion Matrix: ', array([[10068786,        0],\n",
      "       [ 2059905,        0]]))\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/songtao/pvenv_python2.7/lib/python2.7/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.83      1.00      0.91  10068786\n",
      "           1       0.00      0.00      0.00   2059905\n",
      "\n",
      "   micro avg       0.83      0.83      0.83  12128691\n",
      "   macro avg       0.42      0.50      0.45  12128691\n",
      "weighted avg       0.69      0.83      0.75  12128691\n",
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::], train_data[0::, 0],\n",
    "                                                    test_size=0.3, random_state=0)\n",
    "\n",
    "classifier = classify(LogisticRegression, X_train, y_train)\n",
    "predictions = classifier.predict(X_test)\n",
    "print_metrics(y_test, predictions)\n",
    "save_model(classifier)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:27:25.374799Z",
     "start_time": "2018-10-17T07:27:25.366717Z"
    }
   },
   "outputs": [],
   "source": [
    "# 按照指定的格式生成结果\n",
    "def create_submission(ids, predictions, filename='/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/out/submission_ctr.csv'):\n",
    "    submissions = np.concatenate((ids.reshape(len(ids), 1), predictions.reshape(len(predictions), 1)), axis=1)\n",
    "    df = DataFrame(submissions)\n",
    "    df.to_csv(filename, header=['id', 'click'], index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Loading and Testing model and Submitting Resutl"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2018-10-17T07:27:48.941874Z",
     "start_time": "2018-10-17T07:27:25.379636Z"
    }
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from pandas import DataFrame\n",
    "\n",
    "# classifier = joblib.load('classifier.pkl')\n",
    "classifier = joblib.load('/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/out/model/classifier_ctr.pkl')\n",
    "# test_data_df = load_df('test.csv', training=False)\n",
    "test_data_df = load_df('/Users/songtao/personaldriveMac/ai_project/ai_csdn_20180917/ctr_prediction/datasets/test/test', training=False)\n",
    "ids = test_data_df.values[0:, 0]\n",
    "predictions = classifier.predict(test_data_df.values[0:, 1:])\n",
    "create_submission(ids, predictions)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  },
  "toc": {
   "base_numbering": 1.0,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  },
  "varInspector": {
   "cols": {
    "lenName": 16.0,
    "lenType": 16.0,
    "lenVar": 40.0
   },
   "kernels_config": {
    "python": {
     "delete_cmd_postfix": "",
     "delete_cmd_prefix": "del ",
     "library": "var_list.py",
     "varRefreshCmd": "print(var_dic_list())"
    },
    "r": {
     "delete_cmd_postfix": ") ",
     "delete_cmd_prefix": "rm(",
     "library": "var_list.r",
     "varRefreshCmd": "cat(var_dic_list()) "
    }
   },
   "types_to_exclude": [
    "module",
    "function",
    "builtin_function_or_method",
    "instance",
    "_Feature"
   ],
   "window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
